Machine Learning


In [2]:
from __future__ import division
import random

Avoiding Overfitting

The simplest safeguard against overfitting is to split the data: train a model on one portion and evaluate it on a held-out portion it never saw.


In [4]:
def split_data(data, prob):
    """splits the data into fractions [prob, 1-prob]"""
    results = [], []
    
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    return results


def train_test_split(x, y, test_pct):
    data = list(zip(x, y))   # pair up corresponding values
    train, test = split_data(data, 1 - test_pct)

    # unzip trick: turn a list of pairs back into a pair of lists
    x_train, y_train = zip(*train)
    x_test, y_test = zip(*test)

    return x_train, x_test, y_train, y_test
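
A quick usage sketch (made-up data, not from the text): with test_pct = 0.25, roughly a quarter of the points should land in the test set, and corresponding x and y values stay paired.


In [ ]:
xs = range(1000)
ys = [2 * x for x in xs]

x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.25)

# counts vary from run to run because the split is random
print("%d train, %d test" % (len(x_train), len(x_test)))
print("first training pair: %s -> %s" % (x_train[0], y_train[0]))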

Metrics

Accuracy alone can be misleading when classes are imbalanced, so we also track precision, recall, and their harmonic mean, the F1 score.


In [8]:
def accuracy(tp, fp, fn, tn):
    """fraction of all predictions that were correct"""
    correct = tp + tn
    total = tp + fp + fn + tn
    return correct / total

def precision(tp, fp, fn, tn):
    """fraction of positive predictions that were actually positive"""
    return tp / (tp + fp)

def recall(tp, fp, fn, tn):
    """fraction of actual positives that were identified"""
    return tp / (tp + fn)

def f1_score(tp, fp, fn, tn):
    """harmonic mean of precision and recall"""
    p = precision(tp, fp, fn, tn)
    r = recall(tp, fp, fn, tn)

    return 2 * p * r / (p + r)
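
A worked check with hypothetical confusion-matrix counts (not from the text). Note that accuracy comes out high even though a third of the actual positives are missed, which is why the other metrics matter.


In [ ]:
tp, fp, fn, tn = 80, 20, 40, 860   # made-up counts

print("accuracy  %.3f" % accuracy(tp, fp, fn, tn))    # 0.940
print("precision %.3f" % precision(tp, fp, fn, tn))   # 0.800
print("recall    %.3f" % recall(tp, fp, fn, tn))      # 0.667
print("f1_score  %.3f" % f1_score(tp, fp, fn, tn))    # 0.727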

In [9]:
# high bias: the model performs poorly even on the training data
# high variance: performance varies widely across different training sets
# (low training error, unstable test error)

# high bias can be addressed by adding features (a more flexible model)
# high variance can be addressed by removing features or getting more data
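
A minimal sketch of this diagnosis (assumed synthetic data and toy models, not from the text): a constant predictor underfits (high bias: poor even on the training data), while a 1-nearest-neighbor memorizer overfits (high variance: near-zero training error but unstable test error across random splits).


In [ ]:
random.seed(0)
xs = [random.uniform(-1, 1) for _ in range(100)]
ys = [3 * x + random.gauss(0, 0.5) for x in xs]

def mse(predict, x, y):
    """mean squared error of predict over paired data"""
    return sum((predict(xi) - yi) ** 2 for xi, yi in zip(x, y)) / len(x)

for trial in range(3):
    x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.25)
    mean_y = sum(y_train) / len(y_train)

    def constant(x):
        # high bias: ignores x and always predicts the training mean
        return mean_y

    def nearest(x):
        # high variance: returns the label of the closest training point
        i = min(range(len(x_train)), key=lambda j: abs(x_train[j] - x))
        return y_train[i]

    print("trial %d: constant train/test mse %.2f / %.2f, 1-nn %.2f / %.2f" %
          (trial,
           mse(constant, x_train, y_train), mse(constant, x_test, y_test),
           mse(nearest, x_train, y_train), mse(nearest, x_test, y_test)))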
